Given that we are able to create a spectral tree that respects taxonomic relationships and even captures sub-species groupings that corresponds with host environment, we were interested in understanding what genomic signatures were describing these sub-species groupings. However, because these sub-species groups a specifically defined by these genomic features, assumptions of independence typical to statistical tests are violated. Therefore we use a more stringent test to determine whether a given OGG is differentially abundant by using how likely it is for any of the features we are looking at to be significant before deciding if each individual OGG feature is significant.
Specifically to determine whether a given OGG is differentially abundant in a statistically significant way across two groups (‘group A’ and ‘group B’) that arise from a common node in the Spectral Tree, we first generate a ‘reference p-value’ and a ‘null distribution’. The reference p-value signifies the difference in \(OGG_i\) across the two groups (Wilcoxon test). To generate a null distribution of p-values, we create 100 permutations of the distributions between groups A and B that randomly shuffles entries in the two groups. For all OGGs within the Spectral Tree, we compute a distribution of the most significant p-values across all permutations. The Q-value for a given OGG is computed as the number of p-values in the null distribution that are less than the reference p-value (\(P\)) relative to the total number of p-values in the null distribution (equation).
┌ Warning: Cannot join columns with the same name because var_names are intersecting.
└ @ Muon /Users/bend/.julia/packages/Muon/eLqpV/src/mudata.jl:351
keptspecies =sort(filter(x->last(x) >=20, countmap(biobank.obs.Species)), byvalue=true, rev=true)filter!(!=("unclassified"), keptspecies)# species => number of strains belonging to that species
functioncliffs_d(a, b) ans=0.for i in a, j in bans+=sign(i - j)endans/ (length(a) *length(b))endfunctionlog2FC(a, b) mean(log2.(a .+1)) -mean(log2.(b .+1))end
log2FC (generic function with 1 method)
significance tests across full strain variation tree
seed!(123456789)tree = strvartreemaxtreeheight =last(maximum(getheights(tree)))leaves =getleaves(tree)leafnames =name.(leaves);treeorder =indexin(leafnames, strvar_obsnames);ogg_mtx = biobank["oggs"].X[:,:][biobank.obs.kept_species.==1,:]ogg_mask =vec(mapslices(c->std(c) >0, ogg_mtx, dims=1))ogg_mtx = ogg_mtx[treeorder, ogg_mask]@showsize(ogg_mtx)ogg_names = biobank["oggs"].var_names[ogg_mask]ogg_freqs =vec(mean(ogg_mtx .>0, dims=1))taxon_info = strvarobs[treeorder, :]taxon_info.species_donor =join.(eachrow(strvarobs[treeorder, [:Species, :Donor]]), " ")isolate_ids =getleafnames(tree)testresults =DataFrame()for node inprewalk(tree)# if group size would be less than 1 skip node (isleaf(node) ||any(isleaf.(node.children))) &&continue# find comparison groups grp1_idx =indexin(getleafnames(node.children[1]), isolate_ids) grp2_idx =indexin(getleafnames(node.children[2]), isolate_ids) pvals =mapslices(ogg_mtx, dims=1) do columnpvalue(MannWhitneyUTest(column[grp1_idx], column[grp2_idx]))end|> vec# make nullseed!(1234) null_qs =map(1:100) do i n =length(grp1_idx) m =length(grp2_idx) perm_idx =shuffle(vcat(grp1_idx, grp2_idx)) qs =mapslices(ogg_mtx, dims=1) do column_perm q =pvalue(MannWhitneyUTest(column_perm[perm_idx[1:n]], column_perm[perm_idx[n+1:end]]))endminimum(qs)end qvals =mapslices(ogg_mtx, dims=1) do column pval =pvalue(MannWhitneyUTest(column[grp1_idx], column[grp2_idx]))mean(null_qs .< pval)end|> vec effects =mapslices(ogg_mtx, dims=1) do columnabs(mean(column[grp1_idx]) -mean(column[grp2_idx]) /std(column))end|> vec log_effects =mapslices(ogg_mtx, dims=1) do column logcol =log2.(column.+1)abs(mean(logcol[grp1_idx]) -mean(logcol[grp2_idx]) /std(logcol))end|> vec log2FCs =mapslices(ogg_mtx, dims=1) do columnlog2FC(column[grp1_idx], column[grp2_idx])end|> vec cliffs_ds =mapslices(ogg_mtx, dims=1) do columncliffs_d(column[grp1_idx], column[grp2_idx])end|> vec grp1_prp_exp =mapslices(ogg_mtx, dims=1) do columnmean(column[grp1_idx] .>0)end|> vec grp2_prp_exp =mapslices(ogg_mtx, dims=1) do columnmean(column[grp2_idx] .>0)end|> vec testresults =vcat(testresults, DataFrame(:nodeids =>id(node),:nodeheight => NewickTree.height(node),:nodedepth => maxtreeheight - NewickTree.height(node),:grp1_N =>length(grp1_idx),:grp2_N =>length(grp2_idx),:grp1_phylum_mode =>mode(taxon_info.Phylum[grp1_idx]),:grp2_phylum_mode =>mode(taxon_info.Phylum[grp2_idx]),:grp1_species_mode =>mode(taxon_info.Species[grp1_idx]),:grp2_species_mode =>mode(taxon_info.Species[grp2_idx]),:grp1_species_donor_mode =>mode(taxon_info.species_donor[grp1_idx]),:grp2_species_donor_mode =>mode(taxon_info.species_donor[grp2_idx]),:ogg_name => ogg_names,:ogg_idx =>collect(axes(ogg_mtx, 2)),:ogg_freqs => ogg_freqs,:grp1_prp_exp => grp1_prp_exp,:grp2_prp_exp => grp2_prp_exp,:effectsize => effects,:logeffectsize => log_effects,:log2FC => log2FCs,:cliffs_d => cliffs_ds,:pvals => pvals,:qvals => qvals, ) )endtestresults[!, "pval_BH"] .=adjust(testresults.pvals, BenjaminiHochberg());testresults[!, "pval_Bon"] .=adjust(testresults.pvals, Bonferroni());testresults[!, "qval_BH"] .=adjust(testresults.qvals, BenjaminiHochberg());tree_discretization =cut(testresults.nodedepth, [0, 2, 4, 5])recode!(tree_discretization, "[4, 5)"=>"phylum level", "[2, 4)"=>"species level","[0, 2)"=>"strain level")testresults[!, "tree_level"] = tree_discretizationtestresults;
# how many OGGs are still duplicated?sigtestresults |> df ->groupby(df, :ogg_name) |> df ->subset(df, :log2FC => x ->abs.(x) .==maximum(abs.(x))) |> df ->countmap(df.ogg_name) |> cm ->sum(values(cm) .>1)
27
# for these remaining OGGs select the shallowest significant exampleuniqueOGGresults = sigtestresults |> df ->groupby(df, :ogg_name) |> df ->subset(df, :log2FC => x ->abs.(x) .==maximum(abs.(x))) |> df ->groupby(df, :ogg_name) |> df ->subset(df, :nodeheight => x -> x .==minimum(x));@shownrow(uniqueOGGresults) ==length(unique(uniqueOGGresults.ogg_name))@showsum(uniqueOGGresults.tree_level .=="phylum level")@showsum(uniqueOGGresults.tree_level .=="species level")@showsum(uniqueOGGresults.tree_level .=="strain level");
ogg_plotting_order = uniqueOGGresults |># select test at root of rectale sub treedf->subset(df, :nodeids => x-> x.==(id(nthparent(basenode, 3)))) |>df->select(df, [:nodeids, :log2FC, :grp1_N, :grp2_N, :ogg_name, :ogg_idx]) |># add in descriptions of each oggdf->leftjoin(df, uniprot.var[:, [:og, :description]], on =:ogg_name =>:og) |># sort oggs based on abundence per groupdf->sort(df, [:nodeids, :log2FC]) |># get shorter descriptions for lablesdf->coalesce.(df, "") |>df->transform(df, :description =>ByRow(x->first(replace.(x, r"\(.*\)"=>""), 75)) =>:ogg_labels );
# more abundant in MSK.17 and MSK.22subset(ogg_plotting_order,:log2FC =>ByRow(<(0)),:description =>ByRow(!=("")))
17×8 DataFrame
Row
nodeids
log2FC
grp1_N
grp2_N
ogg_name
ogg_idx
description
ogg_labels
Int64
Float64
Int64
Int64
String15
Int64
String
String
1
433
-2.52345
7
13
COG1345
3757
Required for morphogenesis and for the elongation of the flagellar filament by facilitating polymerization of the flagellin monomers at the tip of growing filament. Forms a capping structure, which prevents flagellin subunits (transported through the central channel of the flagellum) from leaking out without polymerization at the distal end
Required for morphogenesis and for the elongation of the flagellar filament
┌ Info: string: 'rectale' finds 1 taxa:
│ 1×7 DataFrame
│ Row │ Kingdom Phylum Class Order Family Genus Species
│ │ String String String String String String String
│ ─────┼─────────────────────────────────────────────────────────────────────────────────────────────────
│ 1 │ Bacteria Firmicutes Clostridia Clostridiales Lachnospiraceae [Eubacterium] rectale
└ @ Main /Users/bend/projects/Doran_etal_2023/notebooks/04_figure_04.ipynb:3
test_oggs =subset(ogg_plotting_order,:description =>ByRow(!=("")), # drop unannotated oggs:description =>ByRow(!=("domain, Protein")), # drop unannotated oggs:log2FC =>ByRow(<(0)), # drop oggs increased in the presence of phage).ogg_nameuniprot.var[indexin(sort(test_oggs), uniprot.var_names), [:og, :description]]
16×2 DataFrame
Row
og
description
String
String
1
COG0835
chemotaxis
2
COG0842
Transport permease protein
3
COG1344
bacterial-type flagellum-dependent cell motility
4
COG1345
Required for morphogenesis and for the elongation of the flagellar filament by facilitating polymerization of the flagellin monomers at the tip of growing filament. Forms a capping structure, which prevents flagellin subunits (transported through the central channel of the flagellum) from leaking out without polymerization at the distal end
protein involved in cytokinesis, contains TGc (transglutaminase protease-like) domain
16
COG5444
nuclease activity
# oggs from E. rectale test that are in Uniprotchosen_oggs_idx =indexin(test_oggs, uniprot.var_names);# make presence/absence matix `subset_ogg_mtx`subset_ogg_mtx = upmtx[:, chosen_oggs_idx] .>0;@show chosen_oggs;
# collect parent nodes back to pseudo-root of uniprot treespecies_node_idx =findall(contains.(uniprot.obs.Species[treeorder], species_name))[1]speciesnode = leafnodes[species_node_idx]depthofspecies =network_distance(uptree, speciesnode)speciesparents = []node = speciesnodefor i in1:depthofspecies node =parent(node)push!(speciesparents, node)endcladesizes =length.(getleafnames.(speciesparents))@info"# of taxa per clade = $(cladesizes)"
┌ Info: # of taxa per clade = [3, 10, 23, 101, 216, 265, 380, 600, 733, 1022, 1065, 2376, 6709, 7047]
└ @ Main /Users/bend/projects/Doran_etal_2023/notebooks/04_figure_04.ipynb:12
# for each node find all children and measure percentage of children where # chosen oggs are presentcladesateachtreelevel =getleafnames.(speciesparents);frac_present =map(cladesateachtreelevel) do clade_ids cladeidx =indexin(clade_ids, uniprot.obs_names)mean(subset_ogg_mtx[cladeidx, :], dims=1) |> vecend|> stack# check how many oggs (n=16) are fully conserved across the smallest clade (n=3)islocallypresent = frac_present[:, 1] .==1;@info"# locally present = $(sum(islocallypresent)), # not locally present = $(sum(.!islocallypresent))"numcols =length(cladesizes);
┌ Info: # locally present = 12, # not locally present = 4
└ @ Main /Users/bend/projects/Doran_etal_2023/notebooks/04_figure_04.ipynb:11
# plot the locally present oggs up through the treeplot( xticks=(1:numcols, string.(cladesizes)), xrotation=45, ylabel="fractional coverage of taxa", xlabel="# taxa in clade", margin=5Plots.Measures.mm, ylims=(0,1), widen=true, size=(600,300), yticks=0:.25:1, legend=:bottomleft,)hline!([.25, .5, .75], c=:grey, alpha=.5, linestyle=:dash)violin!(permutedims(1:numcols), frac_present[islocallypresent, :], alpha=.3, c="#6182ce")dotplot!(permutedims(1:numcols), frac_present[islocallypresent, :], c="#6182ce", alpha=.5, mode=:none, )scatter!([NaN], label=permutedims(["$(species_name) locally conserved OGG (n=$(sum(islocallypresent)))"]), c=["#6182ce"], alpha=[0.5] )
What phage related differences are present for other species?
function to plot sample oggs across distribution of oggs in uniprot
Exploring these tables of test results revealed other instances of phage related strain level differences. For P. vulgatus, M. gnavus, B. thetaiotaomicron, and C. comes we found cases where OGGs associated with phage infection where more abundant in multiple strains in contrast to other “phage suppressed OGGs”. We found no specificity in degree of conservation for these phage suppressed OGGs, highly conserved OGGs were just as likely to be suppressed as OGGs only found in a small number of taxa across all of UniProt.
P. vulgatus
species_title ="Phocaeicola vulgatus"targetid ="DFI.3.23"strvar_leaves =getleaves(strvartree)basenode = strvar_leaves[findfirst(n->name(n) == targetid, strvar_leaves)]subtree =readnw(nwstr(nthparent(basenode, 4)))ladderize!(subtree)# tiplabels = join.(eachrow(strvarobs[indexin(getleafnames(subtree), strvarobs.Strain_ID), [:Strain_ID, :NCBI_Species]]), " ")# idmapping = Dict(k=>v for (k,v) in zip(getleafnames(subtree), tiplabels))# rename_treeleaves!(subtree, idmapping)plot(subtree, c=:black, lw=3, fs=5, size=(600,600), rightmargin=3Plots.cm, leftmargin=10Plots.mm, ylabel=species_title *" (n = $(length(getleaves(subtree))))", framestyle=:grid, ticks=false,)# find and plot test nodes in subtreeleaves_of_test_nodes =map(prewalk(nthparent(basenode, 4))) do node; test_nodes=unique(strain_level_testresults.nodeids)if (Int(id(node)) ∈ test_nodes) &&minimum(length.(getleaves.(children(node)))) >6join(sort(getleafnames(node)), ";")elsenothingendend|>x->filter(!isnothing, x)subtreepos = NewickTree.treepositions(subtree)testnode_positions =map(keys(subtreepos), values(subtreepos)) do k,v x, y = v label =join(sort(getleafnames(k)), ";")if label ∈ leaves_of_test_nodes (; x, y, label)elsenothingendend|>x->filter(!isnothing, x)@dfDataFrame(testnode_positions) scatter!(:x, :y, markersize=6, c=:yellow, markershape=:diamond)
# collected by manually searching through strain-level test resultsnotable_oggs = ["COG0863","COG1083","COG4744", "32ZV1", "COG0666","COG2214","2Z8QV"]plot_conserved_oggs("P. vulgatus", notable_oggs)
M. gnavus
species_title ="[Ruminococcus] gnavus"targetid ="MSK.22.91"strvar_leaves =getleaves(strvartree)basenode = strvar_leaves[findfirst(n->name(n) == targetid, strvar_leaves)]subtreeroot =nthparent(basenode, 5)subtree =readnw(nwstr(subtreeroot))ladderize!(subtree)# tiplabels = join.(eachrow(strvarobs[indexin(getleafnames(subtree), strvarobs.Strain_ID), [:Strain_ID, :NCBI_Species]]), " ")# idmapping = Dict(k=>v for (k,v) in zip(getleafnames(subtree), tiplabels))# rename_treeleaves!(subtree, idmapping)plot(subtree, c=:black, lw=3, fs=5, size=(600,600), rightmargin=3Plots.cm, leftmargin=10Plots.mm, ylabel="M. gnavus (n = 41)", framestyle=:grid, ticks=false,)# find and plot test nodes in subtreeleaves_of_test_nodes =map(prewalk(subtreeroot)) do node; test_nodes=unique(strain_level_testresults.nodeids)if (Int(id(node)) ∈ test_nodes) &&minimum(length.(getleaves.(children(node)))) >=4join(sort(getleafnames(node)), ";")elsenothingendend|>x->filter(!isnothing, x)subtreepos = NewickTree.treepositions(subtree)testnode_positions =map(keys(subtreepos), values(subtreepos)) do k,v x, y = v label =join(sort(getleafnames(k)), ";")if label ∈ leaves_of_test_nodes (; x, y, label)elsenothingendend|>x->filter(!isnothing, x)@dfDataFrame(testnode_positions) scatter!(:x, :y, markersize=6, c=:yellow, markershape=:diamond)
# collected by manually searching through strain-level test resultsnotable_oggs = ["COG1440","COG1447","COG1694", "COG0531","COG1501","COG2222"]plot_conserved_oggs("M. gnavus", notable_oggs)
B. thetaiotaomicron
species_title ="Bacteroides thetaiotaomicron"targetid ="DFI.4.108"strvar_leaves =getleaves(strvartree)basenode = strvar_leaves[findfirst(n->name(n) == targetid, strvar_leaves)]subtreeroot =nthparent(basenode, 4)subtree =readnw(nwstr(subtreeroot))ladderize!(subtree)# tiplabels = join.(eachrow(strvarobs[indexin(getleafnames(subtree), strvarobs.Strain_ID), [:Strain_ID, :NCBI_Species]]), " ")# idmapping = Dict(k=>v for (k,v) in zip(getleafnames(subtree), tiplabels))# rename_treeleaves!(subtree, idmapping)plot(subtree, c=:black, lw=3, fs=5, size=(600,600), rightmargin=3Plots.cm, leftmargin=10Plots.mm, ylabel=species_title *" (n = $(length(getleaves(subtree))))", framestyle=:grid, ticks=false,)# find and plot phage related test nodes in subtreeleaves_of_test_nodes =map(prewalk(subtreeroot)) do node; test_nodes=unique(strain_level_testresults.nodeids)if (Int(id(node)) ∈ test_nodes) &&minimum(length.(getleaves.(children(node)))) ==7join(sort(getleafnames(node)), ";")elsenothingendend|>x->filter(!isnothing, x)subtreepos = NewickTree.treepositions(subtree)testnode_positions =map(keys(subtreepos), values(subtreepos)) do k,v x, y = v label =join(sort(getleafnames(k)), ";")if label ∈ leaves_of_test_nodes (; x, y, label)elsenothingendend|>x->filter(!isnothing, x)@dfDataFrame(testnode_positions) scatter!(:x, :y, markersize=6, c=:yellow, markershape=:diamond)
# collected by manually searching through strain-level test resultsnotable_oggs = ["COG1474","COG2942","COG3410"]plot_conserved_oggs("B. thetaiotaomicron", notable_oggs)
C. comes
species_title ="Coprococcus comes"targetid ="MSK.20.88"strvar_leaves =getleaves(strvartree)basenode = strvar_leaves[findfirst(n->name(n) == targetid, strvar_leaves)]subtreeroot =nthparent(basenode, 6)subtree =readnw(nwstr(subtreeroot))ladderize!(subtree)# tiplabels = join.(eachrow(strvarobs[indexin(getleafnames(subtree), strvarobs.Strain_ID), [:Strain_ID, :NCBI_Species]]), " ")# idmapping = Dict(k=>v for (k,v) in zip(getleafnames(subtree), tiplabels))# rename_treeleaves!(subtree, idmapping)plot(subtree, c=:black, lw=3, fs=5, size=(600,600), rightmargin=3Plots.cm, leftmargin=10Plots.mm, ylabel=species_title *" (n = $(length(getleaves(subtree))))", framestyle=:grid, ticks=false,)# find and plot phage related test nodes in subtreeleaves_of_test_nodes =map(prewalk(subtreeroot)) do node; test_nodes=unique(strain_level_testresults.nodeids)if (Int(id(node)) ∈ test_nodes) &&minimum(length.(getleaves.(children(node)))) ==5join(sort(getleafnames(node)), ";")elsenothingendend|>x->filter(!isnothing, x)subtreepos = NewickTree.treepositions(subtree)testnode_positions =map(keys(subtreepos), values(subtreepos)) do k,v x, y = v label =join(sort(getleafnames(k)), ";")if label ∈ leaves_of_test_nodes (; x, y, label)elsenothingendend|>x->filter(!isnothing, x)@dfDataFrame(testnode_positions) scatter!(:x, :y, markersize=6, c=:yellow, markershape=:diamond)
# collected by manually searching through strain-level test resultsnotable_oggs = ["COG0145","COG1011","COG1351","COG4951","2Z81G",]plot_conserved_oggs("C. comes", notable_oggs)